# Computations
import numpy as np
import pandas as pd
# scipy
from scipy.stats import norm
# preprocessing
from sklearn import preprocessing
import re
# Visualisation libraries
## Text
from colorama import Fore, Back, Style
from IPython.display import Image, display, Markdown, Latex, clear_output
## progressbar
import progressbar
## plotly
from plotly.offline import init_notebook_mode, iplot
import plotly.graph_objs as go
import plotly.offline as py
from plotly.subplots import make_subplots
import plotly.express as px
## seaborn
import seaborn as sns
## matplotlib
import matplotlib.pyplot as plt
from matplotlib.patches import Ellipse, Polygon
from matplotlib.font_manager import FontProperties
import matplotlib.colors as mcolors
plt.style.use('seaborn-whitegrid')
plt.rcParams['axes.labelsize'] = 14
plt.rcParams['xtick.labelsize'] = 12
plt.rcParams['ytick.labelsize'] = 12
plt.rcParams['text.color'] = 'k'
%matplotlib inline
## WordCloud
from wordcloud import WordCloud
import warnings
# Silence every warning category for the rest of the session.
warnings.filterwarnings("ignore")
In this study, we analyze HR data available from kaggle.com.
This data is fictional and was created by IBM data scientists.
Categorical Parameters:
| | 1 | 2 | 3 | 4 | 5 |
|---|---|---|---|---|---|
| Education | Below College | College | Bachelor | Master | Doctor |
| Environment Satisfaction | Low | Medium | High | Very High | |
| Job Involvement | Low | Medium | High | Very High | |
| Job Satisfaction | Low | Medium | High | Very High | |
| Performance Rating | Low | Good | Excellent | Outstanding | |
| Relationship Satisfaction | Low | Medium | High | Very High | |
| Work Life Balance | Bad | Good | Better | Best | |
This can be encoded as follows,
# Code -> label lookup for every ordinal column of the dataset.
# Four of the columns share the same Low..Very High scale; each one still gets
# its own dict object so that editing one mapping never affects another.
_Levels = {1: 'Low', 2: 'Medium', 3: 'High', 4: 'Very High'}
Categorical_Dict = dict([
    ('Education', {1: 'Below College', 2: 'College', 3: 'Bachelor', 4: 'Master', 5: 'Doctor'}),
    ('Environment Satisfaction', dict(_Levels)),
    ('Job Involvement', dict(_Levels)),
    ('Job Satisfaction', dict(_Levels)),
    ('Performance Rating', {1: 'Low', 2: 'Good', 3: 'Excellent', 4: 'Outstanding'}),
    ('Relationship Satisfaction', dict(_Levels)),
    ('Work Life Balance', {1: 'Bad', 2: 'Good', 3: 'Better', 4: 'Best'}),
])
del _Levels
# Load the HR attrition workbook.
Path = 'Data/WA_Fn-UseC_-HR-Employee-Attrition.xlsx'
Data = pd.read_excel(Path)
# Insert a space before every capital letter that follows a word character
# (a header such as 'DailyRate' becomes 'Daily Rate'), then expand the
# abbreviations 'Curr' / 'Num' and detach the trailing '18'.
Temp = [re.sub(r"(\w)([A-Z])", r"\1 \2", x) for x in Data.columns]
Temp = [x.replace(' Curr ', ' Current ').replace('18',' 18').replace('Num ','Number Of ') for x in Temp]
Data.columns = Temp
del Temp
Data['Business Travel'] = Data['Business Travel'].str.replace('_',' ')
# Styler.hide_index() was deprecated in pandas 1.4 and removed in 2.0;
# Styler.hide(axis='index') is its replacement. Keep the old call as a
# fallback for installations that predate Styler.hide.
Temp = Data.head(8).style
Temp = Temp.hide(axis='index') if hasattr(Temp, 'hide') else Temp.hide_index()
display(Temp)
del Temp
Target = 'Attrition'
# Every column except the target and the row identifier. Built from
# Data.columns (not a set difference) so the order is the same on every run.
Featured_Columns = [x for x in Data.columns if x not in {Target, 'Employee Number'}]
| Age | Attrition | Business Travel | Daily Rate | Department | Distance From Home | Education | Education Field | Employee Count | Employee Number | Environment Satisfaction | Gender | Hourly Rate | Job Involvement | Job Level | Job Role | Job Satisfaction | Marital Status | Monthly Income | Monthly Rate | Number Of Companies Worked | Over 18 | Over Time | Percent Salary Hike | Performance Rating | Relationship Satisfaction | Standard Hours | Stock Option Level | Total Working Years | Training Times Last Year | Work Life Balance | Years At Company | Years In Current Role | Years Since Last Promotion | Years With Current Manager |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 41 | Yes | Travel Rarely | 1102 | Sales | 1 | 2 | Life Sciences | 1 | 1 | 2 | Female | 94 | 3 | 2 | Sales Executive | 4 | Single | 5993 | 19479 | 8 | Y | Yes | 11 | 3 | 1 | 80 | 0 | 8 | 0 | 1 | 6 | 4 | 0 | 5 |
| 49 | No | Travel Frequently | 279 | Research & Development | 8 | 1 | Life Sciences | 1 | 2 | 3 | Male | 61 | 2 | 2 | Research Scientist | 2 | Married | 5130 | 24907 | 1 | Y | No | 23 | 4 | 4 | 80 | 1 | 10 | 3 | 3 | 10 | 7 | 1 | 7 |
| 37 | Yes | Travel Rarely | 1373 | Research & Development | 2 | 2 | Other | 1 | 4 | 4 | Male | 92 | 2 | 1 | Laboratory Technician | 3 | Single | 2090 | 2396 | 6 | Y | Yes | 15 | 3 | 2 | 80 | 0 | 7 | 3 | 3 | 0 | 0 | 0 | 0 |
| 33 | No | Travel Frequently | 1392 | Research & Development | 3 | 4 | Life Sciences | 1 | 5 | 4 | Female | 56 | 3 | 1 | Research Scientist | 3 | Married | 2909 | 23159 | 1 | Y | Yes | 11 | 3 | 3 | 80 | 0 | 8 | 3 | 3 | 8 | 7 | 3 | 0 |
| 27 | No | Travel Rarely | 591 | Research & Development | 2 | 1 | Medical | 1 | 7 | 1 | Male | 40 | 3 | 1 | Laboratory Technician | 2 | Married | 3468 | 16632 | 9 | Y | No | 12 | 3 | 4 | 80 | 1 | 6 | 3 | 3 | 2 | 2 | 2 | 2 |
| 32 | No | Travel Frequently | 1005 | Research & Development | 2 | 2 | Life Sciences | 1 | 8 | 4 | Male | 79 | 3 | 1 | Laboratory Technician | 4 | Single | 3068 | 11864 | 0 | Y | No | 13 | 3 | 3 | 80 | 0 | 8 | 2 | 2 | 7 | 7 | 3 | 6 |
| 59 | No | Travel Rarely | 1324 | Research & Development | 3 | 3 | Medical | 1 | 10 | 3 | Female | 81 | 4 | 1 | Laboratory Technician | 1 | Married | 2670 | 9964 | 4 | Y | Yes | 20 | 4 | 1 | 80 | 3 | 12 | 3 | 2 | 1 | 0 | 0 | 0 |
| 30 | No | Travel Rarely | 1358 | Research & Development | 24 | 1 | Life Sciences | 1 | 11 | 4 | Male | 67 | 3 | 1 | Laboratory Technician | 3 | Divorced | 2693 | 13335 | 1 | Y | No | 22 | 4 | 2 | 80 | 1 | 1 | 2 | 3 | 1 | 0 | 0 | 0 |
First off, let's take a look at the dataset
def Data_Plot(Inp, W = False):
    """Summarise and plot the completeness of every column of ``Inp``.

    Builds one row per column holding its dtype (as a string), the number of
    null values, the number of rows of ``Inp`` and the percentage of non-null
    values, then draws that percentage as a bar chart coloured by dtype.

    Parameters
    ----------
    Inp : pandas.DataFrame
        Frame to summarise.
    W : int or False
        Figure width passed to plotly; any falsy value keeps plotly's default.

    Returns
    -------
    pandas.DataFrame
        The summary table with columns 'Features', 'Data Type',
        'Number of NaN Values', 'Size' and 'Percentage'.
    """
    # (The original started with a full Inp.copy() that was overwritten on the
    # very next line; that dead copy is dropped.)
    data_info = Inp.dtypes.astype(str).to_frame(name='Data Type')
    Temp = Inp.isnull().sum().to_frame(name = 'Number of NaN Values')
    data_info = data_info.join(Temp, how='outer')
    data_info['Size'] = Inp.shape[0]
    # Percentage of rows that are NOT null in each column.
    data_info['Percentage'] = 100 - np.round(100*(data_info['Number of NaN Values']/Inp.shape[0]),2)
    data_info = data_info.reset_index(drop = False).rename(columns = {'index':'Features'})
    #
    fig = px.bar(data_info, x= 'Features', y= 'Percentage', color = 'Data Type', text = 'Data Type',
                 color_discrete_sequence = ['PaleGreen', 'LightCyan', 'PeachPuff', 'Pink', 'Plum'],
                 hover_data = data_info.columns)
    fig.update_layout(plot_bgcolor= 'white', legend=dict(x=1.01, y=.5, traceorder="normal",
                                                          bordercolor="DarkGray", borderwidth=1))
    fig.update_traces(texttemplate= 6*' ' + '%{label}', textposition='inside')
    fig.update_traces(marker_line_color= 'Black', marker_line_width=1., opacity=1)
    if W:
        fig.update_layout(width = W)
    fig.show()
    return data_info
_ = Data_Plot(Data)
Moreover, the number of distinct observations in each column is as follows.
def Distinct_Observations(Inp, Target = Target, Featured_Columns = None, YL = None):
    """Plot and return the number of distinct values per column of ``Inp``.

    Parameters
    ----------
    Inp : pandas.DataFrame
        Frame to inspect.
    Target : str
        Column left out when ``Featured_Columns`` is not given.
    Featured_Columns : list-like of str, optional
        Columns to count. Defaults to every column of ``Inp`` except
        ``Target``, in their original order.
    YL : number, optional
        Upper limit of the y axis; the axis is left automatic when omitted.

    Returns
    -------
    pandas.Series
        ``Inp[Featured_Columns].nunique()``.
    """
    # `is None` instead of `== None`: the latter compares element-wise (and
    # then fails inside `if`) when an Index or array is passed.
    if Featured_Columns is None:
        # Derive the columns from Inp itself rather than the global Data, and
        # keep their order stable (a set difference has no defined order).
        Featured_Columns = [x for x in Inp.columns if x != Target]
    Temp = Inp[Featured_Columns].nunique()
    fig = go.Figure([go.Bar(x=Temp.index, y=Temp.values)])
    fig.update_traces(marker_line_color= 'Navy', marker_line_width=1, opacity=1, showlegend = False)
    # The bold tag of the title is now properly closed.
    fig.update_layout(legend_orientation='v', plot_bgcolor= 'white', height= 450, width= 980,
                      title={'text': '<b>' + 'Distinct Observations in Each Column' + '</b>', 'x':0.5,
                             'y': 0.92, 'xanchor': 'center', 'yanchor': 'top'},
                      yaxis_title='Frequency')
    fig.update_xaxes(showline=True, linewidth=1, linecolor='Lightgray', mirror=True,
                     zeroline=False, zerolinewidth=1, zerolinecolor='Black',
                     showgrid=False, gridwidth=1, gridcolor='Lightgray')
    fig.update_yaxes(showline=True, linewidth=1, linecolor='Lightgray', mirror=True,
                     zeroline=True, zerolinewidth=1, zerolinecolor='Black',
                     showgrid=True, gridwidth=1, gridcolor='Lightgray')
    if YL is not None:
        fig.update_yaxes(range =[0, YL])
    fig.show()
    return Temp
Temp = Distinct_Observations(Inp = Data, Featured_Columns = Featured_Columns, YL = 1500)
# Only the columns with more than five distinct values.
Temp.loc[Temp>5].sort_index().to_frame('Distinct Observations')
| | Distinct Observations |
|---|---|
| Age | 43 |
| Daily Rate | 886 |
| Distance From Home | 29 |
| Education Field | 6 |
| Hourly Rate | 71 |
| Job Role | 9 |
| Monthly Income | 1349 |
| Monthly Rate | 1427 |
| Number Of Companies Worked | 10 |
| Percent Salary Hike | 15 |
| Total Working Years | 40 |
| Training Times Last Year | 7 |
| Years At Company | 37 |
| Years In Current Role | 19 |
| Years Since Last Promotion | 16 |
| Years With Current Manager | 18 |
def FeatBins(Inp, Bins):
    """Replace every value of ``Inp`` by the ordinal index of its bin.

    Consecutive entries of ``Bins`` delimit right-closed intervals
    ``(Bins[i], Bins[i+1]]``; a value falling in the i-th interval is encoded
    as ``i``.

    The previous implementation numbered the bins after sorting their string
    labels, so e.g. '(1000, 1500]' was ranked before '(300, 600]' and the
    codes did not follow the numeric order of the bins. Codes are now taken
    from the position of the interval itself.

    Parameters
    ----------
    Inp : pandas.Series
        Numeric values to discretise.
    Bins : sequence of numbers
        Increasing bin edges.

    Returns
    -------
    pandas.Series
        int64 codes aligned with ``Inp.index``; values outside every bin
        (including ``Bins[0]`` itself and missing values) are encoded as -1.
    """
    Intervals = pd.IntervalIndex.from_breaks(Bins)
    Out = pd.cut(Inp, bins = Intervals)
    # Categorical codes follow the order of `Intervals`; -1 marks "no bin".
    return Out.cat.codes.astype('int64')
def FeatAgg(Feat, ColorFeat, Target = Target, Inp = Data):
    """Count the rows of ``Inp`` per (Feat, ColorFeat, Target) combination.

    Returns a flat DataFrame with one row per combination, sorted by ``Feat``,
    holding the three key columns (``Feat`` converted to ``str``), the row
    count in 'Count' and that count as a percentage of all counted rows,
    rounded to two decimals, in 'Percentage'.
    """
    Keys = [Feat, ColorFeat, Target]
    Counts = Inp[Keys].groupby(Keys)[Target].count()
    Out = Counts.to_frame('Count')
    # Share of each group in the overall total, in percent.
    Out['Percentage'] = (100 * Out['Count'] / Out['Count'].sum()).round(2)
    Out = Out.reset_index(drop = False)
    # Sort while Feat still has its original dtype, then stringify for plotting.
    Out = Out.sort_values(by=[Feat])
    Out[Feat] = Out[Feat].astype(str)
    return Out
def DistPlot(Feat, Target = Target, nbins = 20,
             Colors = ['LightSalmon', 'LightBlue'], LC = 'Black',
             yLim = [0, 80], H = 450, titleY = 0.92, Inp = Data):
    """Show a histogram of ``Feat`` coloured by ``Target`` with a marginal box plot.

    Parameters
    ----------
    Feat : str
        Column of ``Inp`` drawn on the x axis.
    Target : str
        Column used to colour the bars.
    nbins : int
        Number of histogram bins requested from plotly.
    Colors : list of str
        Discrete colour sequence.
    LC : str
        Colour of the marker outlines.
    yLim : [low, high]
        Range of the histogram's y axis.
    H : int
        Figure height.
    titleY : float
        Vertical position of the title.
    Inp : pandas.DataFrame
        Data to plot.
    """
    # hover_data must come from the frame that is plotted; the original read
    # the global Data here, which is wrong whenever another Inp is passed.
    fig = px.histogram(Inp, x = Feat, nbins=nbins, color= Target, marginal= 'box',
                       color_discrete_sequence= Colors, hover_data=Inp.columns)
    fig.update_xaxes(showline=True, linewidth=1, linecolor='Lightgray', mirror=True,
                     zeroline=False, zerolinewidth=1, zerolinecolor='Black',
                     showgrid=False, gridwidth=1, gridcolor='Lightgray')
    fig.update_yaxes(showline=True, linewidth=1, linecolor='Lightgray', mirror=True,
                     zeroline=True, zerolinewidth=1, zerolinecolor='Black',
                     showgrid=True, gridwidth=1, gridcolor='Lightgray')
    Name = '%s Distribution by %s' % (Target, Feat)
    # The bold tag of the title is now properly closed.
    fig.update_layout(legend_orientation='v', plot_bgcolor= 'white', height= H, width= 980,
                      title={'text': '<b>' + Name + '</b>', 'x':0.5, 'y': titleY, 'xanchor': 'center', 'yanchor': 'top'},
                      yaxis_title='Frequency')
    fig.update_traces(marker_line_color= LC, marker_line_width=0.5, opacity=1)
    # Applied to the main (histogram) y axis only.
    fig['layout']['yaxis'].update(range=yLim)
    fig.show()
# For plotting
def FeatCut(Feat, ColorFeat, Bins, Target = Target, Inp = Data):
    """Bin ``Feat`` and count rows per (bin, ColorFeat, Target) combination.

    Consecutive entries of ``Bins`` delimit right-closed intervals. Returns a
    flat DataFrame sorted by bin with the three key columns (the bin rendered
    as a string such as '(15, 24]'), the row count in 'Count' and that count
    as a percentage of all counted rows, rounded to two decimals, in
    'Percentage'.
    """
    Keys = [Feat, ColorFeat, Target]
    Intervals = pd.IntervalIndex.from_tuples([(x, y) for x, y in zip(Bins[:-1],Bins[1:])])
    # Work on a private copy: the original wrote the binned column into a
    # slice of Inp (chained assignment / SettingWithCopyWarning).
    Out = Inp[Keys].copy()
    Out[Feat] = pd.cut(Out[Feat], bins = Intervals)
    # Feat is categorical here. observed=False spells out the behaviour this
    # code was written against (empty bin/category combinations are kept with
    # a count of 0) instead of relying on the version-dependent default.
    Out = Out.groupby(Keys, observed = False)[Target].agg(['count']).rename(columns = {'count':'Count'})
    Out['Percentage'] = np.round(100* Out.values /Out.sum().values, 2)
    Out.reset_index(drop = False, inplace = True)
    # Sort while Feat is still an ordered categorical, then stringify.
    Out = Out.sort_values(by=[Feat])
    Out[Feat] = Out[Feat].astype(str)
    return Out
def PlotX(df, Feat, ColorFeat, Target = Target,
          Colors = list(mcolors.TABLEAU_COLORS.values()), LC = 'Black',
          yLim = [0, 35], H = 500, titleY = 0.90):
    """Draw side-by-side bar charts of 'Percentage' for Target 'No' and 'Yes'.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with the columns ``Feat``, ``ColorFeat``, ``Target`` and
        'Percentage' (the layout returned by FeatAgg / FeatCut).
    Feat : str
        Column drawn on the x axis.
    ColorFeat : str
        Column used to colour the bars; also the legend title.
    Target : str
        Column whose values 'No' / 'Yes' select the left / right panel.
    Colors : list of str or None
        Discrete colour sequence; ``None`` lets plotly express pick its own.
    LC : str
        Colour of the bar outlines.
    yLim : [low, high]
        Shared y-axis range.
    H : int
        Figure height.
    titleY : float
        Vertical position of the title.
    """
    # Figure (bold tags in the titles are now properly closed)
    fig = make_subplots(rows=1, cols=2, horizontal_spacing = 0.02, shared_yaxes=True, y_title = 'Percent',
                        subplot_titles=('%s: <b>No</b>' % Target, '%s: <b>Yes</b>' % Target))
    # Left panel = 'No', right panel = 'Yes'. Both are built the same way, so
    # one loop replaces the two copy-pasted halves of the original.
    for Col, Outcome in enumerate(('No', 'Yes'), start = 1):
        # color_discrete_sequence=None is plotly express' own default, so no
        # separate branch is needed for Colors being None.
        Panel = px.bar(df.loc[df[Target] == Outcome], x= Feat, y= 'Percentage', orientation='v',
                       color = ColorFeat, text = 'Percentage', hover_data= df.columns,
                       color_discrete_sequence = Colors)
        for Trace in Panel.data:
            fig.add_trace(Trace, row=1, col=Col)
        TraceStyle = dict(marker_line_color= LC, marker_line_width=1, opacity=1)
        if Col == 2:
            # The legend entries of the left panel already cover the right one.
            TraceStyle['showlegend'] = False
        fig.update_traces(row=1, col=Col, **TraceStyle)
    # Update
    fig.update_xaxes(showline=True, linewidth=1, linecolor='Lightgray', mirror=True,
                     zeroline=False, zerolinewidth=1, zerolinecolor='Black',
                     showgrid=False, gridwidth=1, gridcolor='Lightgray')
    fig.update_yaxes(showline=True, linewidth=1, linecolor='Lightgray', mirror=True,
                     zeroline=False, zerolinewidth=1, zerolinecolor='Black',
                     showgrid=True, gridwidth=1, gridcolor='Lightgray', range= yLim)
    fig.update_layout(legend_orientation='v', legend_title_text=ColorFeat, plot_bgcolor= 'white', height= H, width= 980)
    fig.update_layout(legend=dict(font=dict(color="Black"), bordercolor="Lightgray", borderwidth=1))
    Name = '%s Distribution by %s and %s' % (Feat, ColorFeat, Target)
    fig.update_layout(title={'text': '<b>' + Name + '</b>', 'x':0.5, 'y': titleY, 'xanchor': 'center', 'yanchor': 'top'})
    fig.show()
# df keeps the numeric codes (it is the frame modified "For Preprocessing"
# below); it is copied before Data is relabelled.
df = Data.copy()
# In Data itself, swap the numeric codes of the ordinal columns for their
# text labels. This is for plotting only.
for Feat, Mapping in Categorical_Dict.items():
    Data[Feat] = Data[Feat].replace(Mapping)
del Feat, Mapping
Feat = 'Age'
DistPlot(Feat, yLim = [0, 400])
Bins = [15, 24, 40, 59, 80]
# Colour sequences for the panels that do not use PlotX's default palette.
Palette = {'Gender': ['HotPink', 'RoyalBlue'],
           'Education': ['LightCoral','Khaki','GreenYellow','LimeGreen','ForestGreen'],
           'Business Travel': ['Salmon','Bisque','LimeGreen'],
           'Environment Satisfaction': ['MediumSeaGreen','LightYellow','Bisque','DarkGreen'],
           'Job Involvement': ['MediumSeaGreen','LightYellow','Bisque','DarkGreen']}
# (colour feature, upper y limit), plotted in this order.
Panels = [('Gender', 35), ('Education', 20), ('Business Travel', 40), ('Department', 35),
          ('Education Field', 25), ('Environment Satisfaction', 25), ('Job Involvement', 35)]
for ColorFeat, yMax in Panels:
    Table = FeatCut(Feat, ColorFeat, Bins)
    if ColorFeat in Palette:
        PlotX(Table, Feat, ColorFeat, yLim = [0, yMax], Colors = Palette[ColorFeat])
    else:
        PlotX(Table, Feat, ColorFeat, yLim = [0, yMax])
# For Preprocessing
df[Feat] = FeatBins(Inp = df[Feat], Bins = Bins)
del Feat, ColorFeat, Bins, Table, Palette, Panels, yMax
Feat = 'Business Travel'
# Colour sequences for the panels that do not use PlotX's default palette.
Palette = {'Gender': ['HotPink', 'RoyalBlue'],
           'Education': ['LightCoral','Khaki','GreenYellow','LimeGreen','ForestGreen'],
           'Environment Satisfaction': ['MediumSeaGreen','LightYellow','Bisque','DarkGreen'],
           'Job Involvement': ['MediumSeaGreen','LightYellow','Bisque','DarkGreen']}
# (colour feature, upper y limit), plotted in this order.
Panels = [('Gender', 40), ('Education', 25), ('Department', 50),
          ('Education Field', 30), ('Environment Satisfaction', 20), ('Job Involvement', 40)]
for ColorFeat, yMax in Panels:
    Table = FeatAgg(Feat, ColorFeat)
    if ColorFeat in Palette:
        PlotX(Table, Feat, ColorFeat, yLim = [0, yMax], Colors = Palette[ColorFeat])
    else:
        PlotX(Table, Feat, ColorFeat, yLim = [0, yMax])
del Palette, Panels, yMax
Feat = 'Daily Rate'
DistPlot(Feat, yLim = [0, 150])
Bins = [100, 300, 600, 1000, 1500]
# Colour sequences for the panels that do not use PlotX's default palette.
Palette = {'Gender': ['HotPink', 'RoyalBlue'],
           'Education': ['LightCoral','Khaki','GreenYellow','LimeGreen','ForestGreen'],
           'Business Travel': ['Salmon','Bisque','LimeGreen'],
           'Environment Satisfaction': ['MediumSeaGreen','LightYellow','Bisque','DarkGreen'],
           'Job Involvement': ['MediumSeaGreen','LightYellow','Bisque','DarkGreen']}
# (colour feature, upper y limit), plotted in this order.
Panels = [('Gender', 20), ('Education', 12), ('Business Travel', 25), ('Department', 25),
          ('Education Field', 14), ('Environment Satisfaction', 12), ('Job Involvement', 20)]
for ColorFeat, yMax in Panels:
    Table = FeatCut(Feat, ColorFeat, Bins)
    if ColorFeat in Palette:
        PlotX(Table, Feat, ColorFeat, yLim = [0, yMax], Colors = Palette[ColorFeat])
    else:
        PlotX(Table, Feat, ColorFeat, yLim = [0, yMax])
# For Preprocessing
df[Feat] = FeatBins(Inp = df[Feat], Bins = Bins)
del Feat, ColorFeat, Bins, Table, Palette, Panels, yMax
Feat = 'Department'
# Colour sequences for the panels that do not use PlotX's default palette.
Palette = {'Gender': ['HotPink', 'RoyalBlue'],
           'Education': ['LightCoral','Khaki','GreenYellow','LimeGreen','ForestGreen'],
           'Business Travel': ['Salmon','Bisque','LimeGreen'],
           'Environment Satisfaction': ['MediumSeaGreen','LightYellow','Bisque','DarkGreen'],
           'Job Involvement': ['MediumSeaGreen','LightYellow','Bisque','DarkGreen']}
# (colour feature, upper y limit), plotted in this order.
Panels = [('Gender', 35), ('Education', 25), ('Business Travel', 50),
          ('Education Field', 30), ('Environment Satisfaction', 20), ('Job Involvement', 35)]
for ColorFeat, yMax in Panels:
    Table = FeatAgg(Feat, ColorFeat)
    if ColorFeat in Palette:
        PlotX(Table, Feat, ColorFeat, yLim = [0, yMax], Colors = Palette[ColorFeat])
    else:
        PlotX(Table, Feat, ColorFeat, yLim = [0, yMax])
del Palette, Panels, yMax
Feat = 'Distance From Home'
DistPlot(Feat, yLim = [0, 300])
Bins = [0, 5, 10, 20, 30]
# Colour sequences for the panels that do not use PlotX's default palette.
Palette = {'Gender': ['HotPink', 'RoyalBlue'],
           'Education': ['LightCoral','Khaki','GreenYellow','LimeGreen','ForestGreen'],
           'Business Travel': ['Salmon','Bisque','LimeGreen'],
           'Environment Satisfaction': ['MediumSeaGreen','LightYellow','Bisque','DarkGreen'],
           'Job Involvement': ['MediumSeaGreen','LightYellow','Bisque','DarkGreen']}
# (colour feature, upper y limit), plotted in this order.
Panels = [('Gender', 25), ('Education', 14), ('Business Travel', 30), ('Department', 25),
          ('Education Field', 16), ('Environment Satisfaction', 14), ('Job Involvement', 25)]
for ColorFeat, yMax in Panels:
    Table = FeatCut(Feat, ColorFeat, Bins)
    if ColorFeat in Palette:
        PlotX(Table, Feat, ColorFeat, yLim = [0, yMax], Colors = Palette[ColorFeat])
    else:
        PlotX(Table, Feat, ColorFeat, yLim = [0, yMax])
# For Preprocessing
df[Feat] = FeatBins(Inp = df[Feat], Bins = Bins)
del Feat, ColorFeat, Bins, Table, Palette, Panels, yMax
Feat = 'Education Field'
# Colour sequences for the panels that do not use PlotX's default palette.
Palette = {'Gender': ['HotPink', 'RoyalBlue'],
           'Education': ['LightCoral','Khaki','GreenYellow','LimeGreen','ForestGreen'],
           'Business Travel': ['Salmon','Bisque','LimeGreen'],
           'Environment Satisfaction': ['MediumSeaGreen','LightYellow','Bisque','DarkGreen'],
           'Job Involvement': ['MediumSeaGreen','LightYellow','Bisque','DarkGreen']}
# (colour feature, upper y limit), plotted in this order.
Panels = [('Gender', 25), ('Education', 14), ('Business Travel', 30),
          ('Department', 30), ('Environment Satisfaction', 12), ('Job Involvement', 25)]
for ColorFeat, yMax in Panels:
    Table = FeatAgg(Feat, ColorFeat)
    if ColorFeat in Palette:
        PlotX(Table, Feat, ColorFeat, yLim = [0, yMax], Colors = Palette[ColorFeat])
    else:
        PlotX(Table, Feat, ColorFeat, yLim = [0, yMax])
del Palette, Panels, yMax
Feat = 'Hourly Rate'
DistPlot(Feat, yLim = [0, 150])
Bins = [25, 50, 75, 101]
# Colour sequences for the panels that do not use PlotX's default palette.
Palette = {'Gender': ['HotPink', 'RoyalBlue'],
           'Education': ['LightCoral','Khaki','GreenYellow','LimeGreen','ForestGreen'],
           'Business Travel': ['Salmon','Bisque','LimeGreen'],
           'Environment Satisfaction': ['MediumSeaGreen','LightYellow','Bisque','DarkGreen'],
           'Job Involvement': ['MediumSeaGreen','LightYellow','Bisque','DarkGreen']}
# (colour feature, upper y limit), plotted in this order.
Panels = [('Gender', 20), ('Education', 14), ('Business Travel', 25), ('Department', 25),
          ('Education Field', 14), ('Environment Satisfaction', 12), ('Job Involvement', 20)]
for ColorFeat, yMax in Panels:
    Table = FeatCut(Feat, ColorFeat, Bins)
    if ColorFeat in Palette:
        PlotX(Table, Feat, ColorFeat, yLim = [0, yMax], Colors = Palette[ColorFeat])
    else:
        PlotX(Table, Feat, ColorFeat, yLim = [0, yMax])
# For Preprocessing
df[Feat] = FeatBins(Inp = df[Feat], Bins = Bins)
del Feat, ColorFeat, Bins, Table, Palette, Panels, yMax
Feat = 'Job Involvement'
# Colour sequences for the panels that do not use PlotX's default palette.
Palette = {'Gender': ['HotPink', 'RoyalBlue'],
           'Education': ['LightCoral','Khaki','GreenYellow','LimeGreen','ForestGreen'],
           'Business Travel': ['Salmon','Bisque','LimeGreen'],
           'Environment Satisfaction': ['MediumSeaGreen','LightYellow','Bisque','DarkGreen']}
# (colour feature, upper y limit), plotted in this order.
Panels = [('Gender', 35), ('Education', 20), ('Business Travel', 40),
          ('Department', 35), ('Education Field', 25), ('Environment Satisfaction', 20)]
for ColorFeat, yMax in Panels:
    Table = FeatAgg(Feat, ColorFeat)
    if ColorFeat in Palette:
        PlotX(Table, Feat, ColorFeat, yLim = [0, yMax], Colors = Palette[ColorFeat])
    else:
        PlotX(Table, Feat, ColorFeat, yLim = [0, yMax])
del Palette, Panels, yMax
Feat = 'Job Role'
# Colour sequences for the panels that do not use PlotX's default palette.
Palette = {'Gender': ['HotPink', 'RoyalBlue'],
           'Education': ['LightCoral','Khaki','GreenYellow','LimeGreen','ForestGreen'],
           'Business Travel': ['Salmon','Bisque','LimeGreen'],
           'Environment Satisfaction': ['MediumSeaGreen','LightYellow','Bisque','DarkGreen'],
           'Job Involvement': ['MediumSeaGreen','LightYellow','Bisque','DarkGreen']}
# (colour feature, upper y limit), plotted in this order.
Panels = [('Gender', 12), ('Education', 8), ('Business Travel', 14), ('Department', 20),
          ('Education Field', 8), ('Environment Satisfaction', 6), ('Job Involvement', 12)]
for ColorFeat, yMax in Panels:
    Table = FeatAgg(Feat, ColorFeat)
    if ColorFeat in Palette:
        PlotX(Table, Feat, ColorFeat, yLim = [0, yMax], Colors = Palette[ColorFeat])
    else:
        PlotX(Table, Feat, ColorFeat, yLim = [0, yMax])
del Palette, Panels, yMax
Feat = 'Job Satisfaction'
# Colour sequences for the panels that do not use PlotX's default palette.
Palette = {'Gender': ['HotPink', 'RoyalBlue'],
           'Education': ['LightCoral','Khaki','GreenYellow','LimeGreen','ForestGreen'],
           'Business Travel': ['Salmon','Bisque','LimeGreen'],
           'Environment Satisfaction': ['MediumSeaGreen','LightYellow','Bisque','DarkGreen'],
           'Job Involvement': ['MediumSeaGreen','LightYellow','Bisque','DarkGreen']}
# (colour feature, upper y limit), plotted in this order.
Panels = [('Gender', 20), ('Education', 10), ('Business Travel', 20), ('Department', 20),
          ('Education Field', 14), ('Environment Satisfaction', 10), ('Job Involvement', 20)]
for ColorFeat, yMax in Panels:
    Table = FeatAgg(Feat, ColorFeat)
    if ColorFeat in Palette:
        PlotX(Table, Feat, ColorFeat, yLim = [0, yMax], Colors = Palette[ColorFeat])
    else:
        PlotX(Table, Feat, ColorFeat, yLim = [0, yMax])
del Palette, Panels, yMax
Feat = 'Marital Status'
# Colour sequences for the panels that do not use PlotX's default palette.
Palette = {'Gender': ['HotPink', 'RoyalBlue'],
           'Education': ['LightCoral','Khaki','GreenYellow','LimeGreen','ForestGreen'],
           'Business Travel': ['Salmon','Bisque','LimeGreen'],
           'Environment Satisfaction': ['MediumSeaGreen','LightYellow','Bisque','DarkGreen'],
           'Job Involvement': ['MediumSeaGreen','LightYellow','Bisque','DarkGreen']}
# (colour feature, upper y limit), plotted in this order.
Panels = [('Gender', 25), ('Education', 16), ('Business Travel', 30), ('Department', 30),
          ('Education Field', 20), ('Environment Satisfaction', 14), ('Job Involvement', 25)]
for ColorFeat, yMax in Panels:
    Table = FeatAgg(Feat, ColorFeat)
    if ColorFeat in Palette:
        PlotX(Table, Feat, ColorFeat, yLim = [0, yMax], Colors = Palette[ColorFeat])
    else:
        PlotX(Table, Feat, ColorFeat, yLim = [0, yMax])
del Palette, Panels, yMax
Feat = 'Monthly Income'
DistPlot(Feat, yLim = [0, 400])
Bins = [1e3, 3e3, 7e3, 1e4, 2e4]
# Colour sequences for the panels that do not use PlotX's default palette.
Palette = {'Gender': ['HotPink', 'RoyalBlue'],
           'Education': ['LightCoral','Khaki','GreenYellow','LimeGreen','ForestGreen'],
           'Business Travel': ['Salmon','Bisque','LimeGreen'],
           'Environment Satisfaction': ['MediumSeaGreen','LightYellow','Bisque','DarkGreen'],
           'Job Involvement': ['MediumSeaGreen','LightYellow','Bisque','DarkGreen']}
# (colour feature, upper y limit), plotted in this order.
Panels = [('Gender', 25), ('Education', 14), ('Business Travel', 30), ('Department', 25),
          ('Education Field', 16), ('Environment Satisfaction', 14), ('Job Involvement', 25)]
for ColorFeat, yMax in Panels:
    Table = FeatCut(Feat, ColorFeat, Bins)
    if ColorFeat in Palette:
        PlotX(Table, Feat, ColorFeat, yLim = [0, yMax], Colors = Palette[ColorFeat])
    else:
        PlotX(Table, Feat, ColorFeat, yLim = [0, yMax])
# For Preprocessing
df[Feat] = FeatBins(Inp = df[Feat], Bins = Bins)
del Feat, ColorFeat, Bins, Table, Palette, Panels, yMax
Feat = 'Monthly Rate'
DistPlot(Feat, yLim = [0, 200])
Bins = [2e3, 1e4, 2e4, 3e4]
# Colour sequences for the panels that do not use PlotX's default palette.
Palette = {'Gender': ['HotPink', 'RoyalBlue'],
           'Education': ['LightCoral','Khaki','GreenYellow','LimeGreen','ForestGreen'],
           'Business Travel': ['Salmon','Bisque','LimeGreen'],
           'Environment Satisfaction': ['MediumSeaGreen','LightYellow','Bisque','DarkGreen'],
           'Job Involvement': ['MediumSeaGreen','LightYellow','Bisque','DarkGreen']}
# (colour feature, upper y limit), plotted in this order.
Panels = [('Gender', 20), ('Education', 14), ('Business Travel', 25), ('Department', 25),
          ('Education Field', 16), ('Environment Satisfaction', 12), ('Job Involvement', 25)]
for ColorFeat, yMax in Panels:
    Table = FeatCut(Feat, ColorFeat, Bins)
    if ColorFeat in Palette:
        PlotX(Table, Feat, ColorFeat, yLim = [0, yMax], Colors = Palette[ColorFeat])
    else:
        PlotX(Table, Feat, ColorFeat, yLim = [0, yMax])
# For Preprocessing
df[Feat] = FeatBins(Inp = df[Feat], Bins = Bins)
del Feat, ColorFeat, Bins, Table, Palette, Panels, yMax
Feat = 'Number Of Companies Worked'
DistPlot(Feat, yLim = [0, 600])
# The lower edge is -1 so that a value of 0 falls inside the first
# (right-closed) bin.
Bins = [-1, 2, 5, 10]
# Display the first bin as '[0, 2]' instead of '(-1, 2]'. The original skipped
# this relabelling for the 'Education Field' panel only; it is now applied to
# every panel.
Relabel = {'(-1, 2]': '[0, 2]'}
# Colour sequences for the panels that do not use PlotX's default palette.
Palette = {'Gender': ['HotPink', 'RoyalBlue'],
           'Education': ['LightCoral','Khaki','GreenYellow','LimeGreen','ForestGreen'],
           'Business Travel': ['Salmon','Bisque','LimeGreen'],
           'Environment Satisfaction': ['MediumSeaGreen','LightYellow','Bisque','DarkGreen'],
           'Job Involvement': ['MediumSeaGreen','LightYellow','Bisque','DarkGreen']}
# (colour feature, upper y limit), plotted in this order.
Panels = [('Gender', 35), ('Education', 20), ('Business Travel', 35), ('Department', 35),
          ('Education Field', 25), ('Environment Satisfaction', 20), ('Job Involvement', 30)]
for ColorFeat, yMax in Panels:
    Table = FeatCut(Feat, ColorFeat, Bins).replace(Relabel)
    if ColorFeat in Palette:
        PlotX(Table, Feat, ColorFeat, yLim = [0, yMax], Colors = Palette[ColorFeat])
    else:
        PlotX(Table, Feat, ColorFeat, yLim = [0, yMax])
# For Preprocessing
df[Feat] = FeatBins(Inp = df[Feat], Bins = Bins)
del Feat, ColorFeat, Bins, Table, Relabel, Palette, Panels, yMax
Feat = 'Over Time'
# Colour sequences for the panels that do not use PlotX's default palette.
Palette = {'Gender': ['HotPink', 'RoyalBlue'],
           'Education': ['LightCoral','Khaki','GreenYellow','LimeGreen','ForestGreen'],
           'Business Travel': ['Salmon','Bisque','LimeGreen'],
           'Environment Satisfaction': ['MediumSeaGreen','LightYellow','Bisque','DarkGreen'],
           'Job Involvement': ['MediumSeaGreen','LightYellow','Bisque','DarkGreen']}
# (colour feature, upper y limit), plotted in this order.
Panels = [('Gender', 40), ('Education', 30), ('Business Travel', 50), ('Department', 50),
          ('Education Field', 30), ('Environment Satisfaction', 20), ('Job Involvement', 40)]
for ColorFeat, yMax in Panels:
    Table = FeatAgg(Feat, ColorFeat)
    if ColorFeat in Palette:
        PlotX(Table, Feat, ColorFeat, yLim = [0, yMax], Colors = Palette[ColorFeat])
    else:
        PlotX(Table, Feat, ColorFeat, yLim = [0, yMax])
del Feat, ColorFeat, Table, Palette, Panels, yMax
Feat = 'Percent Salary Hike'
DistPlot(Feat, yLim = [0, 250])
# Bin edges 10, 14, 18, 22, 26.
Bins = list(np.arange(10, 27, 4))
# Colour sequences for the panels that do not use PlotX's default palette.
Palette = {'Gender': ['HotPink', 'RoyalBlue'],
           'Education': ['LightCoral','Khaki','GreenYellow','LimeGreen','ForestGreen'],
           'Business Travel': ['Salmon','Bisque','LimeGreen'],
           'Environment Satisfaction': ['MediumSeaGreen','LightYellow','Bisque','DarkGreen'],
           'Job Involvement': ['MediumSeaGreen','LightYellow','Bisque','DarkGreen']}
# (colour feature, upper y limit), plotted in this order.
Panels = [('Gender', 30), ('Education', 20), ('Business Travel', 35), ('Department', 35),
          ('Education Field', 20), ('Environment Satisfaction', 16), ('Job Involvement', 30)]
for ColorFeat, yMax in Panels:
    Table = FeatCut(Feat, ColorFeat, Bins)
    if ColorFeat in Palette:
        PlotX(Table, Feat, ColorFeat, yLim = [0, yMax], Colors = Palette[ColorFeat])
    else:
        PlotX(Table, Feat, ColorFeat, yLim = [0, yMax])
# For Preprocessing
df[Feat] = FeatBins(Inp = df[Feat], Bins = Bins)
del Feat, ColorFeat, Bins, Table, Palette, Panels, yMax
Feat = 'Performance Rating'
# Colour sequences for the panels that do not use PlotX's default palette.
Palette = {'Gender': ['HotPink', 'RoyalBlue'],
           'Education': ['LightCoral','Khaki','GreenYellow','LimeGreen','ForestGreen'],
           'Business Travel': ['Salmon','Bisque','LimeGreen'],
           'Environment Satisfaction': ['MediumSeaGreen','LightYellow','Bisque','DarkGreen'],
           'Job Involvement': ['MediumSeaGreen','LightYellow','Bisque','DarkGreen']}
# (colour feature, upper y limit), plotted in this order.
Panels = [('Gender', 50), ('Education', 30), ('Business Travel', 60), ('Department', 50),
          ('Education Field', 30), ('Environment Satisfaction', 30), ('Job Involvement', 50)]
for ColorFeat, yMax in Panels:
    Table = FeatAgg(Feat, ColorFeat)
    if ColorFeat in Palette:
        PlotX(Table, Feat, ColorFeat, yLim = [0, yMax], Colors = Palette[ColorFeat])
    else:
        PlotX(Table, Feat, ColorFeat, yLim = [0, yMax])
del Feat, ColorFeat, Table, Palette, Panels, yMax
Feat = 'Relationship Satisfaction'
# Colour sequences for the panels that do not use PlotX's default palette.
Palette = {'Gender': ['HotPink', 'RoyalBlue'],
           'Education': ['LightCoral','Khaki','GreenYellow','LimeGreen','ForestGreen'],
           'Business Travel': ['Salmon','Bisque','LimeGreen'],
           'Environment Satisfaction': ['MediumSeaGreen','LightYellow','Bisque','DarkGreen'],
           'Job Involvement': ['MediumSeaGreen','LightYellow','Bisque','DarkGreen']}
# (colour feature, upper y limit), plotted in this order.
Panels = [('Gender', 20), ('Education', 12), ('Business Travel', 20), ('Department', 20),
          ('Education Field', 12), ('Environment Satisfaction', 10), ('Job Involvement', 20)]
for ColorFeat, yMax in Panels:
    Table = FeatAgg(Feat, ColorFeat)
    if ColorFeat in Palette:
        PlotX(Table, Feat, ColorFeat, yLim = [0, yMax], Colors = Palette[ColorFeat])
    else:
        PlotX(Table, Feat, ColorFeat, yLim = [0, yMax])
del Feat, ColorFeat, Table, Palette, Panels, yMax
# 'Stock Option Level' aggregated against each colour feature.
# Each entry pairs a colour feature with the keyword arguments handed to PlotX;
# entries without 'Colors' let PlotX use its own default palette.
Feat = 'Stock Option Level'
Plot_Specs = [
    ('Gender', {'yLim': [0, 25], 'Colors': ['HotPink', 'RoyalBlue']}),
    ('Education', {'yLim': [0, 15], 'Colors': ['LightCoral', 'Khaki', 'GreenYellow', 'LimeGreen', 'ForestGreen']}),
    ('Business Travel', {'yLim': [0, 30], 'Colors': ['Salmon', 'Bisque', 'LimeGreen']}),
    ('Department', {'yLim': [0, 25]}),
    ('Education Field', {'yLim': [0, 16]}),
    ('Environment Satisfaction', {'yLim': [0, 14], 'Colors': ['MediumSeaGreen', 'LightYellow', 'Bisque', 'DarkGreen']}),
    ('Job Involvement', {'yLim': [0, 25], 'Colors': ['MediumSeaGreen', 'LightYellow', 'Bisque', 'DarkGreen']}),
]
for ColorFeat, Plot_Args in Plot_Specs:
    Table = FeatAgg(Feat, ColorFeat)
    PlotX(Table, Feat, ColorFeat, **Plot_Args)
# Remove the section's temporaries, including the loop helpers.
del Feat, ColorFeat, Table, Plot_Specs, Plot_Args
# 'Total Working Years': distribution plot, then binned breakdowns per colour feature.
Feat = 'Total Working Years'
DistPlot(Feat, yLim=[0, 500])
Bins = [-1, 10, 20, 30, 41]
# The lowest bin starts at -1 only so that 0 is included; relabel it for display.
First_Bin_Label = {'(-1, 10]': '[0, 10]'}
# Each entry pairs a colour feature with the keyword arguments handed to PlotX;
# entries without 'Colors' let PlotX use its own default palette.
Plot_Specs = [
    ('Gender', {'yLim': [0, 35], 'Colors': ['HotPink', 'RoyalBlue']}),
    ('Education', {'yLim': [0, 20], 'Colors': ['LightCoral', 'Khaki', 'GreenYellow', 'LimeGreen', 'ForestGreen']}),
    ('Business Travel', {'yLim': [0, 40], 'Colors': ['Salmon', 'Bisque', 'LimeGreen']}),
    ('Department', {'yLim': [0, 35]}),
    ('Education Field', {'yLim': [0, 25]}),
    ('Environment Satisfaction', {'yLim': [0, 16], 'Colors': ['MediumSeaGreen', 'LightYellow', 'Bisque', 'DarkGreen']}),
    ('Job Involvement', {'yLim': [0, 35], 'Colors': ['MediumSeaGreen', 'LightYellow', 'Bisque', 'DarkGreen']}),
]
for ColorFeat, Plot_Args in Plot_Specs:
    Table = FeatCut(Feat, ColorFeat, Bins).replace(First_Bin_Label)
    PlotX(Table, Feat, ColorFeat, **Plot_Args)
# For Preprocessing
df[Feat] = FeatBins(Inp=df[Feat], Bins=Bins)
# Remove the section's temporaries, including the loop helpers.
del Feat, ColorFeat, Bins, Table, First_Bin_Label, Plot_Specs, Plot_Args
# 'Training Times Last Year' aggregated against each colour feature.
# Each entry pairs a colour feature with the keyword arguments handed to PlotX;
# entries without 'Colors' let PlotX use its own default palette.
Feat = 'Training Times Last Year'
Plot_Specs = [
    ('Gender', {'yLim': [0, 20], 'Colors': ['HotPink', 'RoyalBlue']}),
    ('Education', {'yLim': [0, 12], 'Colors': ['LightCoral', 'Khaki', 'GreenYellow', 'LimeGreen', 'ForestGreen']}),
    ('Business Travel', {'yLim': [0, 25], 'Colors': ['Salmon', 'Bisque', 'LimeGreen']}),
    ('Department', {'yLim': [0, 25]}),
    ('Education Field', {'yLim': [0, 14]}),
    ('Environment Satisfaction', {'yLim': [0, 12], 'Colors': ['MediumSeaGreen', 'LightYellow', 'Bisque', 'DarkGreen']}),
    ('Job Involvement', {'yLim': [0, 20], 'Colors': ['MediumSeaGreen', 'LightYellow', 'Bisque', 'DarkGreen']}),
]
for ColorFeat, Plot_Args in Plot_Specs:
    Table = FeatAgg(Feat, ColorFeat)
    PlotX(Table, Feat, ColorFeat, **Plot_Args)
# Remove the section's temporaries, including the loop helpers.
del Feat, ColorFeat, Table, Plot_Specs, Plot_Args
# 'Work Life Balance' aggregated against each colour feature.
# Each entry pairs a colour feature with the keyword arguments handed to PlotX;
# entries without 'Colors' let PlotX use its own default palette.
Feat = 'Work Life Balance'
Plot_Specs = [
    ('Gender', {'yLim': [0, 35], 'Colors': ['HotPink', 'RoyalBlue']}),
    ('Education', {'yLim': [0, 20], 'Colors': ['LightCoral', 'Khaki', 'GreenYellow', 'LimeGreen', 'ForestGreen']}),
    ('Business Travel', {'yLim': [0, 40], 'Colors': ['Salmon', 'Bisque', 'LimeGreen']}),
    ('Department', {'yLim': [0, 35]}),
    ('Education Field', {'yLim': [0, 25]}),
    ('Environment Satisfaction', {'yLim': [0, 20], 'Colors': ['MediumSeaGreen', 'LightYellow', 'Bisque', 'DarkGreen']}),
    ('Job Involvement', {'yLim': [0, 35], 'Colors': ['MediumSeaGreen', 'LightYellow', 'Bisque', 'DarkGreen']}),
]
for ColorFeat, Plot_Args in Plot_Specs:
    Table = FeatAgg(Feat, ColorFeat)
    PlotX(Table, Feat, ColorFeat, **Plot_Args)
# Remove the section's temporaries, including the loop helpers.
del Feat, ColorFeat, Table, Plot_Specs, Plot_Args
# 'Years At Company': distribution plot, then binned breakdowns per colour feature.
Feat = 'Years At Company'
DistPlot(Feat, yLim=[0, 600])
Bins = [-1, 10, 20, 30, 41]
# The lowest bin starts at -1 only so that 0 is included; relabel it for display.
First_Bin_Label = {'(-1, 10]': '[0, 10]'}
# Each entry pairs a colour feature with the keyword arguments handed to PlotX;
# entries without 'Colors' let PlotX use its own default palette.
Plot_Specs = [
    ('Gender', {'yLim': [0, 50], 'Colors': ['HotPink', 'RoyalBlue']}),
    ('Education', {'yLim': [0, 30], 'Colors': ['LightCoral', 'Khaki', 'GreenYellow', 'LimeGreen', 'ForestGreen']}),
    ('Business Travel', {'yLim': [0, 50], 'Colors': ['Salmon', 'Bisque', 'LimeGreen']}),
    ('Department', {'yLim': [0, 50]}),
    ('Education Field', {'yLim': [0, 30]}),
    ('Environment Satisfaction', {'yLim': [0, 30], 'Colors': ['MediumSeaGreen', 'LightYellow', 'Bisque', 'DarkGreen']}),
    ('Job Involvement', {'yLim': [0, 50], 'Colors': ['MediumSeaGreen', 'LightYellow', 'Bisque', 'DarkGreen']}),
]
for ColorFeat, Plot_Args in Plot_Specs:
    Table = FeatCut(Feat, ColorFeat, Bins).replace(First_Bin_Label)
    PlotX(Table, Feat, ColorFeat, **Plot_Args)
# For Preprocessing
df[Feat] = FeatBins(Inp=df[Feat], Bins=Bins)
# Remove the section's temporaries, including the loop helpers.
del Feat, ColorFeat, Bins, Table, First_Bin_Label, Plot_Specs, Plot_Args
# 'Years In Current Role': distribution plot, then binned breakdowns per colour feature.
Feat = 'Years In Current Role'
DistPlot(Feat, yLim=[0, 400])
Bins = [-1, 4, 8, 12, 19]
# The lowest bin starts at -1 only so that 0 is included; relabel it for display.
First_Bin_Label = {'(-1, 4]': '[0, 4]'}
# Each entry pairs a colour feature with the keyword arguments handed to PlotX;
# entries without 'Colors' let PlotX use its own default palette.
Plot_Specs = [
    ('Gender', {'yLim': [0, 30], 'Colors': ['HotPink', 'RoyalBlue']}),
    ('Education', {'yLim': [0, 20], 'Colors': ['LightCoral', 'Khaki', 'GreenYellow', 'LimeGreen', 'ForestGreen']}),
    ('Business Travel', {'yLim': [0, 40], 'Colors': ['Salmon', 'Bisque', 'LimeGreen']}),
    ('Department', {'yLim': [0, 35]}),
    ('Education Field', {'yLim': [0, 25]}),
    ('Environment Satisfaction', {'yLim': [0, 16], 'Colors': ['MediumSeaGreen', 'LightYellow', 'Bisque', 'DarkGreen']}),
    ('Job Involvement', {'yLim': [0, 35], 'Colors': ['MediumSeaGreen', 'LightYellow', 'Bisque', 'DarkGreen']}),
]
for ColorFeat, Plot_Args in Plot_Specs:
    Table = FeatCut(Feat, ColorFeat, Bins).replace(First_Bin_Label)
    PlotX(Table, Feat, ColorFeat, **Plot_Args)
# For Preprocessing
df[Feat] = FeatBins(Inp=df[Feat], Bins=Bins)
# Remove the section's temporaries, including the loop helpers.
del Feat, ColorFeat, Bins, Table, First_Bin_Label, Plot_Specs, Plot_Args
# 'Years Since Last Promotion': distribution plot, then binned breakdowns per colour feature.
Feat = 'Years Since Last Promotion'
DistPlot(Feat, yLim=[0, 600])
Bins = [-1, 5, 10, 16]
# The lowest bin starts at -1 only so that 0 is included; relabel it for display.
First_Bin_Label = {'(-1, 5]': '[0, 5]'}
# Each entry pairs a colour feature with the keyword arguments handed to PlotX;
# entries without 'Colors' let PlotX use its own default palette.
Plot_Specs = [
    ('Gender', {'yLim': [0, 50], 'Colors': ['HotPink', 'RoyalBlue']}),
    ('Education', {'yLim': [0, 30], 'Colors': ['LightCoral', 'Khaki', 'GreenYellow', 'LimeGreen', 'ForestGreen']}),
    ('Business Travel', {'yLim': [0, 60], 'Colors': ['Salmon', 'Bisque', 'LimeGreen']}),
    ('Department', {'yLim': [0, 50]}),
    ('Education Field', {'yLim': [0, 35]}),
    ('Environment Satisfaction', {'yLim': [0, 25], 'Colors': ['MediumSeaGreen', 'LightYellow', 'Bisque', 'DarkGreen']}),
    ('Job Involvement', {'yLim': [0, 50], 'Colors': ['MediumSeaGreen', 'LightYellow', 'Bisque', 'DarkGreen']}),
]
for ColorFeat, Plot_Args in Plot_Specs:
    Table = FeatCut(Feat, ColorFeat, Bins).replace(First_Bin_Label)
    PlotX(Table, Feat, ColorFeat, **Plot_Args)
# For Preprocessing
df[Feat] = FeatBins(Inp=df[Feat], Bins=Bins)
# Remove the section's temporaries, including the loop helpers.
del Feat, ColorFeat, Bins, Table, First_Bin_Label, Plot_Specs, Plot_Args
# 'Years With Current Manager': distribution plot, then binned breakdowns per colour feature.
Feat = 'Years With Current Manager'
DistPlot(Feat, yLim=[0, 400])
Bins = [-1, 4, 8, 12, 18]
# The lowest bin starts at -1 only so that 0 is included; relabel it for display.
First_Bin_Label = {'(-1, 4]': '[0, 4]'}
# Each entry pairs a colour feature with the keyword arguments handed to PlotX;
# entries without 'Colors' let PlotX use its own default palette.
Plot_Specs = [
    ('Gender', {'yLim': [0, 35], 'Colors': ['HotPink', 'RoyalBlue']}),
    ('Education', {'yLim': [0, 20], 'Colors': ['LightCoral', 'Khaki', 'GreenYellow', 'LimeGreen', 'ForestGreen']}),
    ('Business Travel', {'yLim': [0, 40], 'Colors': ['Salmon', 'Bisque', 'LimeGreen']}),
    ('Department', {'yLim': [0, 35]}),
    ('Education Field', {'yLim': [0, 25]}),
    ('Environment Satisfaction', {'yLim': [0, 20], 'Colors': ['MediumSeaGreen', 'LightYellow', 'Bisque', 'DarkGreen']}),
    ('Job Involvement', {'yLim': [0, 35], 'Colors': ['MediumSeaGreen', 'LightYellow', 'Bisque', 'DarkGreen']}),
]
for ColorFeat, Plot_Args in Plot_Specs:
    Table = FeatCut(Feat, ColorFeat, Bins).replace(First_Bin_Label)
    PlotX(Table, Feat, ColorFeat, **Plot_Args)
# For Preprocessing
df[Feat] = FeatBins(Inp=df[Feat], Bins=Bins)
# Remove the section's temporaries, including the loop helpers.
del Feat, ColorFeat, Bins, Table, First_Bin_Label, Plot_Specs, Plot_Args
Now
# Call the Data_Plot helper (defined elsewhere in this file) on Data; the
# return value is bound to _ and not used.
_ = Data_Plot(Data)
# Call Distinct_Observations on df for the columns in Featured_Columns.
# YL = 10 is forwarded unchanged (presumably a y-axis limit; TODO confirm
# against the helper's definition).
_ = Distinct_Observations(Inp = df, Featured_Columns = Featured_Columns, YL = 10)
In the dataset, Attrition indicates whether or not an employee has churned (left the company). We would like to build a predictive model for this feature.
We need to convert categorical data to numeric data.
def dtypes_group(Inp):
    """Summarise the columns of a DataFrame grouped by data type.

    Parameters
    ----------
    Inp : pandas.DataFrame
        Frame whose ``dtypes`` are summarised.

    Returns
    -------
    pandas.DataFrame
        One row per distinct dtype with the columns
        ``'Data Type'`` (dtype name as a string), ``'Features'`` (list of
        column labels having that dtype, in their original column order) and
        ``'Count'`` (length of that list). Rows are ordered by dtype name.
    """
    # Group on the dtype *name* rather than on the dtype object: dtype objects
    # only define a partial order (and extension dtypes such as 'category'
    # define none), so sorting them directly is fragile and its result is not
    # reproducible.
    Groups = {}
    for Feature, Kind in Inp.dtypes.items():
        Groups.setdefault(str(Kind), []).append(Feature)
    Rows = [(Name, Groups[Name], len(Groups[Name])) for Name in sorted(Groups)]
    return pd.DataFrame(Rows, columns=['Data Type', 'Features', 'Count'])
def dtype_sep(Inp):
    """Split the column labels of ``Inp`` into numeric and categorical lists.

    A column counts as numeric when its dtype name is one of ``int64``,
    ``int32``, ``float64`` or ``float32``, and as categorical when its dtype
    name is ``object``. Columns of any other dtype appear in neither list.

    Returns
    -------
    tuple of (list, list)
        ``(Numeric_Columns, Categorical_Columns)``, each in column order.
    """
    numeric_kinds = ('int64', 'int32', 'float64', 'float32')
    Numeric_Columns = []
    Categorical_Columns = []
    for label, kind in Inp.dtypes.items():
        kind_name = str(kind)
        if kind_name in numeric_kinds:
            Numeric_Columns.append(label)
        elif kind_name == 'object':
            Categorical_Columns.append(label)
    return Numeric_Columns, Categorical_Columns
# Split the column names of Data by dtype; Categorical_Columns drives the
# label-encoding loop further down.
Numeric_Columns, Categorical_Columns = dtype_sep(Data)
# Render the dtype summary of Data without the index column.
# NOTE(review): Styler.hide_index() was deprecated in pandas 1.4 and removed in
# 2.0 -- confirm the pinned pandas version before upgrading.
display(dtypes_group(Data).style.hide_index())
| Data Type | Features | Count |
|---|---|---|
| int64 | ['Age', 'Years In Current Role', 'Years At Company', 'Training Times Last Year', 'Total Working Years', 'Stock Option Level', 'Standard Hours', 'Percent Salary Hike', 'Number Of Companies Worked', 'Monthly Rate', 'Monthly Income', 'Years Since Last Promotion', 'Job Level', 'Years With Current Manager', 'Hourly Rate', 'Daily Rate', 'Employee Number', 'Employee Count', 'Distance From Home'] | 19 |
| object | ['Attrition', 'Business Travel', 'Work Life Balance', 'Department', 'Education', 'Relationship Satisfaction', 'Performance Rating', 'Education Field', 'Over Time', 'Over 18', 'Environment Satisfaction', 'Gender', 'Job Satisfaction', 'Job Role', 'Job Involvement', 'Marital Status'] | 16 |
We can use LabelEncoder to convert the categorical features to numeric ones. Therefore,
# Label-encode every column listed in Categorical_Columns, in place on df.
N = len(Categorical_Columns)
# Progressbar
Counter = 0  # number of columns encoded so far (stays 0 if there are none)
Progress_Bar = progressbar.ProgressBar(maxval= N, widgets=[progressbar.Bar('=', '|', '|'), progressbar.Percentage()])
#--------------- the loop ----------------------
Progress_Bar.start()
for Counter, Column in enumerate(Categorical_Columns, start=1):
    # A fresh encoder per column: the classes learned for one column must not
    # leak into the next.
    le = preprocessing.LabelEncoder()
    le.fit(list(df[Column]))
    df[Column] = le.transform(df[Column])
    del le
    # Report the number of *completed* columns; updating before the increment
    # made the bar lag one step behind the actual progress.
    Progress_Bar.update(Counter)
Progress_Bar.finish()
#--------------- End of the loop ---------------
# Finally, converting values of df back to integers.
df = df.astype(int)
display(dtypes_group(df).style.hide_index())
|=========================================================================|100%
| Data Type | Features | Count |
|---|---|---|
| int32 | ['Age', 'Monthly Rate', 'Number Of Companies Worked', 'Over 18', 'Over Time', 'Percent Salary Hike', 'Performance Rating', 'Monthly Income', 'Relationship Satisfaction', 'Stock Option Level', 'Total Working Years', 'Training Times Last Year', 'Work Life Balance', 'Years At Company', 'Years In Current Role', 'Standard Hours', 'Years Since Last Promotion', 'Marital Status', 'Job Role', 'Attrition', 'Business Travel', 'Daily Rate', 'Department', 'Distance From Home', 'Education', 'Job Satisfaction', 'Education Field', 'Employee Number', 'Environment Satisfaction', 'Gender', 'Hourly Rate', 'Job Involvement', 'Job Level', 'Employee Count', 'Years With Current Manager'] | 35 |
First, we remove features that have zero variance as these features don't add anything to our modeling.
# Variance of every featured column, rounded so that floating-point noise
# around zero is treated as exactly zero.
Temp = df[Featured_Columns].var().sort_values(ascending=False).round(16)
# Keep only the names of the columns whose (rounded) variance is zero.
Temp = Temp.index[Temp == 0].tolist()
print(Back.BLACK + Fore.CYAN + Style.NORMAL + 'Features with variance zero' + Style.RESET_ALL + ':' + ', '.join(Temp))
# Constant columns carry no information for the model, so drop them from df.
df = df.drop(columns=Temp)
del Temp
Features with variance zero:Employee Count, Over 18, Standard Hours
# Columns kept out of the feature matrix: the target itself and the
# 'Employee Number' column.
Aditional_Columns = [Target, 'Employee Number']
# X holds every remaining column of df; y is the target column.
X = df.drop(columns = Aditional_Columns)
y = df[Target]
Moreover, high variance for some features can hurt our modeling process. For this reason, we would like to standardize features by removing the mean and scaling to unit variance. In this article, we demonstrated the benefits of scaling data using StandardScaler().
# scaling data
# Standardize every feature of X to zero mean and unit variance.
scaler = preprocessing.StandardScaler()
X_std = scaler.fit_transform(X)
# fit_transform returns a bare array; restore both the column names and the
# row labels of X so that later column assignments back into df align row by
# row even when df does not carry a default RangeIndex.
X_std = pd.DataFrame(data=X_std, columns=X.columns, index=X.index)
del scaler
# Two stacked one-row heatmaps comparing the per-feature variance of the raw
# features (X) with that of the standardized features (X_std).
fig, ax = plt.subplots(2, 1, figsize=(18, 8))
ax = ax.ravel()
font = FontProperties()
font.set_weight('bold')
CP = [sns.color_palette("OrRd", 20), sns.color_palette("Greens", 20)]
Names = ['Variance of the Features', 'Variance of the Features (Standardized)']
Sets = [X, X_std]
kws = dict(label='Feature\nVariance', aspect=20, shrink= .3)
for Axis, Palette, Title, Set in zip(ax, CP, Names, Sets):
    # One-row frame (row label 'Variance'); columns are the features ordered
    # by decreasing variance.
    Temp = Set.var().sort_values(ascending=False).to_frame(name='Variance').round(2).T
    # The row maximum is indexed by the label 'Variance', so it is read with
    # .iloc[0]; plain [0] relied on the deprecated positional fallback of
    # Series.__getitem__.
    _ = sns.heatmap(Temp, ax=Axis, annot=True, square=True, cmap=Palette,
                    linewidths=0.8, vmin=0, vmax=Temp.max(axis=1).iloc[0],
                    annot_kws={"size": 6}, cbar_kws=kws)
    _ = Axis.set_yticklabels('')
    _ = Axis.set_title(Title, fontproperties=font, fontsize=16)
    del Temp
# Remove the plotting temporaries, including the loop variables.
del CP, Names, ax, fig, font, Sets, kws, Axis, Palette, Title, Set
Modifying dataset.
import os

# Replace the feature columns of df with their standardized counterparts.
df[X.columns.tolist()] = X_std[X.columns.tolist()]
# Write the result next to the input file as '<name>_STD.csv'.
# os.path.splitext strips only the final extension; splitting on the first '.'
# would truncate paths such as './data/file.csv' or 'data.v1.csv'.
df.to_csv(os.path.splitext(Path)[0] + '_STD.csv', index=False, header=True)